# Load the Titanic training data from the working directory into a data frame.
titanic <- read.csv(file = "Titanic_train.csv")
## Warning in file(file, "rt"): cannot open file 'Titanic_train.csv': No such
## file or directory
## Error in file(file, "rt"): cannot open the connection
What would be some good features to consider here?
# Widen the console so the data frame preview is not wrapped.
options(width = 110)
# Preview the first rows of the raw data.
head(titanic)
## Error in head(titanic): object 'titanic' not found
# Class of every column before any recoding.
sapply(titanic, class)
## Error in lapply(X = X, FUN = FUN, ...): object 'titanic' not found
# Recode the survival indicator as a labelled factor. factor() applies the
# labels to the sorted unique values in order, so exactly two distinct values
# are expected (presumably 0 -> "died", 1 -> "survived"; confirm against data).
titanic$Survived <- factor(titanic$Survived, labels = c("died", "survived"))
## Error in factor(titanic$Survived, labels = c("died", "survived")): object 'titanic' not found
# Recode the embarkation port with readable names. Four labels are supplied,
# so the column must hold exactly four distinct values (presumably "", "C",
# "Q", "S", with the empty string meaning the port is unknown; confirm).
titanic$Embarked <- factor(
  titanic$Embarked,
  labels = c("unknown", "Cherbourg", "Queenstown", "Southampton")
)
## Error in factor(titanic$Embarked, labels = c("unknown", "Cherbourg", "Queenstown", : object 'titanic' not found
# Class of every column again, to confirm the two conversions above.
sapply(titanic, class)
## Error in lapply(X = X, FUN = FUN, ...): object 'titanic' not found
# Structure of the outcome column and of the Sex column.
str(titanic$Survived)
## Error in str(titanic$Survived): object 'titanic' not found
str(titanic$Sex)
## Error in str(titanic$Sex): object 'titanic' not found
# Passenger counts per survival class; reused by the pie chart further down.
survivedTable <- table(titanic$Survived)
## Error in table(titanic$Survived): object 'titanic' not found
survivedTable
## Error in eval(expr, envir, enclos): object 'survivedTable' not found
# Overall survival as a single pie; all margins are zeroed so the pie fills
# the plotting device.
par(oma = c(0, 0, 0, 0), mar = c(0, 0, 0, 0))
pie(x = survivedTable, labels = c("Died", "Survived"))
## Error in pie(survivedTable, labels = c("Died", "Survived")): object 'survivedTable' not found
# Split the passengers by the value of the Sex column.
male <- titanic[titanic$Sex == "male", ]
## Error in eval(expr, envir, enclos): object 'titanic' not found
female <- titanic[titanic$Sex == "female", ]
## Error in eval(expr, envir, enclos): object 'titanic' not found
# Two pies side by side (1 x 2 layout), leaving room at the top for a title.
par(mfrow = c(1, 2), mar = c(0, 0, 2, 0), oma = c(0, 1, 0, 1))
pie(
  x = table(male$Survived),
  main = "Survival Portion Among Men",
  labels = c("Dead", "Survived")
)
## Error in table(male$Survived): object 'male' not found
pie(
  x = table(female$Survived),
  main = "Survival Portion Among Women",
  labels = c("Dead", "Survived")
)
## Error in table(female$Survived): object 'female' not found
# Keep the age column in its own vector for the plots that follow, then
# print its summary statistics.
Age <- titanic$Age
summary(Age)
## Error in eval(expr, envir, enclos): object 'titanic' not found
## Error in summary(Age): object 'Age' not found
How about a summary segmented by survival?
# Age summary for the passengers who died, then for those who survived.
summary(titanic$Age[titanic$Survived == "died"])
## Error in summary(titanic[titanic$Survived == "died", ]$Age): object 'titanic' not found
summary(titanic$Age[titanic$Survived == "survived"])
## Error in summary(titanic[titanic$Survived == "survived", ]$Age): object 'titanic' not found
# Plots are laid out two per page (1 x 2).
par(mfrow = c(1, 2), mar = c(4, 4, 2, 2), oma = c(1, 1, 1, 1))
# Passing the data frame through `data =` keeps the default axis labels short
# ("Sex", "Age") instead of echoing the full `titanic$...` expressions.
boxplot(
  Age ~ Sex,
  data = titanic,
  main = "Age Distribution By Gender",
  col = c("red", "green")
)
## Error in eval(expr, envir, enclos): object 'titanic' not found
# Survived is a factor with the levels "died" and "survived", so the groups
# are already named on the axis; the axis title only needs to name the variable.
boxplot(
  Age ~ Survived,
  data = titanic,
  main = "Age Distribution By Survival",
  col = c("red", "green"),
  xlab = "Survival",
  ylab = "Age"
)
## Error in eval(expr, envir, enclos): object 'titanic' not found
# Histogram of all passenger ages.
hist(
  Age,
  col = "blue",
  xlab = "Age",
  ylab = "Frequency",
  main = "Distribution of Passenger Ages on Titanic"
)
## Error in hist(Age, col = "blue", xlab = "Age", ylab = "Frequency", main = "Distribution of Passenger Ages on Titanic"): object 'Age' not found
# Kernel density estimate of age; density(Age) won't work while Age contains
# NAs, so the missing ages are dropped first.
d <- density(na.omit(Age))
## Error in na.omit(Age): object 'Age' not found
plot(d, main = "kernel density of Ages of Titanic Passengers")
## Error in plot(d, main = "kernel density of Ages of Titanic Passengers"): object 'd' not found
# Fill the area under the density curve.
polygon(d, col = "red", border = "blue")
## Error in xy.coords(x, y): object 'd' not found
## Error in na.omit(titanic): object 'titanic' not found
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Sex"): plot.new has not been called yet
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in plot(d, main = "kernel density of Ages of Titanic Passengers", : object 'd' not found
## Error in xy.coords(x, y): object 'd' not found
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Sex", cex.main = 3): plot.new has not been called yet
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Sex): object 'titanic_na_removed' not found
## Error in is.vector(x): object 'titanic_na_removed' not found
## Error in title(main = "Kernel Density Plot of Ages By Survival", cex.main = 3): plot.new has not been called yet
## Error in levels(titanic_na_removed$Survived): object 'titanic_na_removed' not found
## Error in levels(titanic_na_removed$Survived): object 'titanic_na_removed' not found
An example of feature engineering!
## Multi-dimensional comparison
# Derive an age-group feature from Age:
#   "Child" - age below the cutoff (exclusive)
#   "Adult" - age at or above the cutoff
#   NA      - age is missing
# The labels are assigned in a single vectorised step. Intermediate numeric
# codes are avoided on purpose: small integers are themselves valid ages, so
# recoding through them only works for particular cutoffs and statement orders.
child_age_cutoff <- 13
Child <- ifelse(titanic$Age < child_age_cutoff, "Child", "Adult")
# Appends the new column to the titanic dataset
titanic_with_child_column <- cbind(titanic, Child)
## Error in cbind(titanic, Child): object 'titanic' not found
# Removes rows where age is NA (the age group is NA exactly for those rows)
titanic_with_child_column <- titanic_with_child_column[
  !is.na(titanic_with_child_column$Child),
]
## Error in eval(expr, envir, enclos): object 'titanic_with_child_column' not found
## Error in ggplot(titanic_with_child_column, aes(y = Fare, x = Survived)): object 'titanic_with_child_column' not found
## Error in is.factor(x): object 'titanic' not found
## Error in ggplot(titanic, aes(y = Fare, x = Pclass)): object 'titanic' not found
# Load the diamonds data set that ships with ggplot2. Naming the package
# explicitly lets data() find it even when ggplot2 has not been attached yet.
data(diamonds, package = "ggplot2") # loading diamonds data set
head(diamonds, 16) # first 16 rows of the diamonds data set
## carat cut color clarity depth table price x y z
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4.00 4.05 2.39
## 11 0.30 Good J SI1 64.0 55 339 4.25 4.28 2.73
## 12 0.23 Ideal J VS1 62.8 56 340 3.93 3.90 2.46
## 13 0.22 Premium F SI1 60.4 61 342 3.88 3.84 2.33
## 14 0.31 Ideal J SI2 62.2 54 344 4.35 4.37 2.71
## 15 0.20 Premium E SI2 60.2 62 345 3.79 3.75 2.27
## 16 0.32 Premium E I1 60.9 58 345 4.38 4.42 2.68
library(ggplot2)
# Histogram of carat using ggplot2's default binning.
ggplot(data = diamonds) +
  geom_histogram(mapping = aes(x = carat))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Kernel density of carat, filled in grey.
ggplot(data = diamonds) +
  geom_density(mapping = aes(x = carat), fill = "gray50")
# Base layer mapping carat to x and price to y, saved in a variable so the
# scatter plots below can all build on it.
g <- ggplot(data = diamonds, mapping = aes(x = carat, y = price))
# Plain scatter plot of price against carat.
g + geom_point()
# Same scatter plot with the points coloured by the diamond's color grade.
g + geom_point(mapping = aes(color = color))
# Coloured scatter plot split into one panel per color grade.
g +
  geom_point(mapping = aes(color = color)) +
  facet_wrap(~color)
What have you learned about diamond prices after exploring this data?